## Both small enough to easily put in memory, so just downloaded them to my r project folder and can read them in directly
batmen <- read_csv("batter-names.csv")
stats <- read_csv("2019-statcast.csv")
# Let's get a general sense of both data frames using skim and glimpse
skim(batmen) # No whitespace in batter name, which is nice
| Name | batmen |
| Number of rows | 990 |
| Number of columns | 2 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| batter_name | 0 | 1 | 8 | 21 | 0 | 989 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| key_mlbam | 0 | 1 | 580678.6 | 67528.06 | 282332 | 543108.2 | 596058 | 637863 | 676606 | ▁▁▃▅▇ |
glimpse(batmen) # batter_name is last name. first name all lower case
## Rows: 990
## Columns: 2
## $ key_mlbam <dbl> 547989, 660670, 542436, 642715, 613534, 571431, 451192, 50…
## $ batter_name <chr> "abreu, josé", "acuna, ronald", "adames, cristhian", "adam…
skim(stats) # No whitespace in batter name, which is nice
| Name | stats |
| Number of rows | 743356 |
| Number of columns | 67 |
| _______________________ | |
| Column type frequency: | |
| character | 15 |
| Date | 1 |
| numeric | 51 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| pitch_type | 7004 | 0.99 | 2 | 2 | 0 | 12 | 0 |
| player_name | 0 | 1.00 | 8 | 22 | 0 | 829 | 0 |
| events | 554778 | 0.25 | 4 | 25 | 0 | 26 | 0 |
| description | 0 | 1.00 | 4 | 23 | 0 | 13 | 0 |
| des | 0 | 1.00 | 14 | 400 | 0 | 125971 | 0 |
| game_type | 0 | 1.00 | 1 | 1 | 0 | 5 | 0 |
| stand | 0 | 1.00 | 1 | 1 | 0 | 2 | 0 |
| p_throws | 0 | 1.00 | 1 | 1 | 0 | 2 | 0 |
| home_team | 0 | 1.00 | 2 | 3 | 0 | 30 | 0 |
| away_team | 0 | 1.00 | 2 | 3 | 0 | 30 | 0 |
| type | 0 | 1.00 | 1 | 1 | 0 | 3 | 0 |
| inning_topbot | 0 | 1.00 | 3 | 3 | 0 | 2 | 0 |
| pitch_name | 7004 | 0.99 | 6 | 15 | 0 | 12 | 0 |
| if_fielding_alignment | 5415 | 0.99 | 8 | 13 | 0 | 3 | 0 |
| of_fielding_alignment | 5415 | 0.99 | 8 | 14 | 0 | 3 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| game_date | 0 | 1 | 2019-03-20 | 2019-10-30 | 2019-06-29 | 210 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| release_speed | 7311 | 0.99 | 88.69 | 5.93 | 50.60 | 84.50 | 89.80 | 93.40 | 104.30 | ▁▁▂▇▃ |
| release_pos_x | 7332 | 0.99 | -0.76 | 1.96 | -5.38 | -2.16 | -1.53 | 1.23 | 4.98 | ▁▇▂▃▁ |
| release_pos_z | 7332 | 0.99 | 5.90 | 0.50 | 0.11 | 5.63 | 5.92 | 6.22 | 10.54 | ▁▁▇▂▁ |
| batter | 0 | 1.00 | 571465.57 | 67473.20 | 282332.00 | 519222.00 | 592567.00 | 622569.00 | 676606.00 | ▁▁▃▇▇ |
| pitcher | 0 | 1.00 | 571800.29 | 67839.45 | 282332.00 | 519326.00 | 592789.00 | 621381.00 | 677976.00 | ▁▁▅▆▇ |
| zone | 7332 | 0.99 | 9.24 | 4.19 | 1.00 | 6.00 | 11.00 | 13.00 | 14.00 | ▂▃▂▃▇ |
| balls | 0 | 1.00 | 0.89 | 0.97 | 0.00 | 0.00 | 1.00 | 2.00 | 4.00 | ▇▅▃▂▁ |
| strikes | 0 | 1.00 | 0.90 | 0.83 | 0.00 | 0.00 | 1.00 | 2.00 | 2.00 | ▇▁▆▁▆ |
| game_year | 0 | 1.00 | 2019.00 | 0.00 | 2019.00 | 2019.00 | 2019.00 | 2019.00 | 2019.00 | ▁▁▇▁▁ |
| pfx_x | 7332 | 0.99 | -0.15 | 0.86 | -2.66 | -0.87 | -0.24 | 0.53 | 2.38 | ▁▇▇▆▁ |
| pfx_z | 7332 | 0.99 | 0.65 | 0.74 | -2.21 | 0.22 | 0.79 | 1.25 | 5.39 | ▁▇▇▁▁ |
| plate_x | 7332 | 0.99 | 0.04 | 0.85 | -5.03 | -0.54 | 0.04 | 0.62 | 6.29 | ▁▃▇▁▁ |
| plate_z | 7332 | 0.99 | 2.25 | 0.96 | -4.13 | 1.62 | 2.25 | 2.87 | 12.21 | ▁▇▆▁▁ |
| on_3b | 675314 | 0.09 | 572890.71 | 66953.16 | 405395.00 | 519317.00 | 592663.00 | 623912.00 | 676606.00 | ▂▂▅▇▆ |
| on_2b | 607801 | 0.18 | 571908.66 | 67662.88 | 400085.00 | 519203.00 | 592663.00 | 623520.00 | 676606.00 | ▂▃▅▇▆ |
| on_1b | 518745 | 0.30 | 570370.28 | 67451.54 | 400085.00 | 519048.00 | 592444.00 | 622110.00 | 676606.00 | ▂▃▅▇▆ |
| outs_when_up | 0 | 1.00 | 0.98 | 0.82 | 0.00 | 0.00 | 1.00 | 2.00 | 2.00 | ▇▁▇▁▇ |
| inning | 0 | 1.00 | 5.03 | 2.69 | 1.00 | 3.00 | 5.00 | 7.00 | 19.00 | ▇▇▂▁▁ |
| hc_x | 616189 | 0.17 | 126.56 | 41.17 | 2.00 | 102.43 | 125.66 | 153.76 | 246.88 | ▁▃▇▅▁ |
| hc_y | 616189 | 0.17 | 120.52 | 44.69 | 2.30 | 85.14 | 122.97 | 157.66 | 230.96 | ▁▅▅▇▁ |
| vx0 | 7332 | 0.99 | 2.32 | 5.99 | -19.92 | -2.55 | 4.12 | 6.70 | 18.68 | ▁▃▃▇▁ |
| vy0 | 7332 | 0.99 | -128.94 | 8.61 | -151.33 | -135.78 | -130.60 | -122.86 | -73.14 | ▃▇▂▁▁ |
| vz0 | 7332 | 0.99 | -4.22 | 3.00 | -19.92 | -6.25 | -4.36 | -2.34 | 15.52 | ▁▃▇▁▁ |
| ax | 7332 | 0.99 | -2.52 | 10.30 | -30.80 | -11.23 | -2.74 | 5.29 | 28.55 | ▁▇▇▅▁ |
| ay | 7332 | 0.99 | 25.62 | 3.75 | 7.63 | 22.80 | 25.62 | 28.38 | 46.79 | ▁▃▇▁▁ |
| az | 7332 | 0.99 | -23.48 | 8.74 | -51.43 | -29.51 | -22.62 | -16.10 | 30.39 | ▂▇▆▁▁ |
| sz_top | 7332 | 0.99 | 3.37 | 0.19 | 2.50 | 3.25 | 3.37 | 3.49 | 4.31 | ▁▂▇▁▁ |
| sz_bot | 7332 | 0.99 | 1.60 | 0.11 | 0.75 | 1.54 | 1.60 | 1.67 | 2.44 | ▁▁▇▁▁ |
| hit_distance_sc | 548516 | 0.26 | 165.22 | 123.61 | 0.00 | 30.00 | 174.00 | 260.00 | 526.00 | ▇▆▅▃▁ |
| launch_speed | 539851 | 0.27 | 83.88 | 14.22 | 7.60 | 73.90 | 82.90 | 95.40 | 125.30 | ▁▁▅▇▂ |
| launch_angle | 539850 | 0.27 | 16.62 | 28.68 | -89.00 | -3.00 | 18.00 | 37.00 | 89.00 | ▁▂▇▇▂ |
| effective_speed | 4889 | 0.99 | 88.04 | 8.04 | 0.00 | 83.90 | 89.50 | 93.30 | 105.00 | ▁▁▁▃▇ |
| release_spin_rate | 20084 | 0.97 | 2255.36 | 323.34 | 419.00 | 2103.00 | 2274.00 | 2444.00 | 3741.00 | ▁▁▇▃▁ |
| release_extension | 7332 | 0.99 | 5.99 | 0.50 | 0.60 | 5.70 | 6.00 | 6.30 | 9.90 | ▁▁▇▅▁ |
| game_pk | 0 | 1.00 | 566671.45 | 4109.91 | 564734.00 | 565456.00 | 566197.00 | 566932.00 | 599377.00 | ▇▁▁▁▁ |
| pitcher.1 | 0 | 1.00 | 571800.29 | 67839.45 | 282332.00 | 519326.00 | 592789.00 | 621381.00 | 677976.00 | ▁▁▅▆▇ |
| release_pos_y | 7332 | 0.99 | 54.51 | 0.50 | 50.59 | 54.17 | 54.50 | 54.83 | 59.94 | ▁▅▇▁▁ |
| launch_speed_angle | 618174 | 0.17 | 3.20 | 1.26 | 1.00 | 2.00 | 3.00 | 4.00 | 6.00 | ▇▆▆▁▂ |
| at_bat_number | 0 | 1.00 | 39.51 | 23.42 | 1.00 | 19.00 | 39.00 | 58.00 | 148.00 | ▇▇▅▁▁ |
| pitch_number | 0 | 1.00 | 2.93 | 1.74 | 1.00 | 1.00 | 3.00 | 4.00 | 16.00 | ▇▂▁▁▁ |
| home_score | 0 | 1.00 | 2.36 | 2.68 | 0.00 | 0.00 | 2.00 | 4.00 | 21.00 | ▇▂▁▁▁ |
| away_score | 0 | 1.00 | 2.56 | 2.84 | 0.00 | 0.00 | 2.00 | 4.00 | 23.00 | ▇▂▁▁▁ |
| bat_score | 0 | 1.00 | 2.44 | 2.71 | 0.00 | 0.00 | 2.00 | 4.00 | 23.00 | ▇▂▁▁▁ |
| fld_score | 0 | 1.00 | 2.48 | 2.81 | 0.00 | 0.00 | 2.00 | 4.00 | 23.00 | ▇▂▁▁▁ |
| post_away_score | 0 | 1.00 | 2.58 | 2.84 | 0.00 | 0.00 | 2.00 | 4.00 | 23.00 | ▇▂▁▁▁ |
| post_home_score | 0 | 1.00 | 2.37 | 2.68 | 0.00 | 0.00 | 2.00 | 4.00 | 21.00 | ▇▂▁▁▁ |
| post_bat_score | 0 | 1.00 | 2.47 | 2.72 | 0.00 | 0.00 | 2.00 | 4.00 | 23.00 | ▇▂▁▁▁ |
| post_fld_score | 0 | 1.00 | 2.48 | 2.81 | 0.00 | 0.00 | 2.00 | 4.00 | 23.00 | ▇▂▁▁▁ |
| spin_axis | 7332 | 0.99 | 178.36 | 68.67 | 0.00 | 134.00 | 194.00 | 225.00 | 360.00 | ▂▃▇▅▁ |
| delta_home_win_exp | 2 | 1.00 | 0.00 | 0.03 | -0.74 | 0.00 | 0.00 | 0.00 | 0.91 | ▁▁▇▁▁ |
| delta_run_exp | 67 | 1.00 | 0.00 | 0.25 | -1.66 | -0.07 | -0.02 | 0.04 | 3.70 | ▁▇▁▁▁ |
glimpse(stats) # batter_name is last name. first name all lower case
## Rows: 743,356
## Columns: 67
## $ pitch_type <chr> "SL", "FF", "FF", "FF", "SL", "FF", "FF", "FF", …
## $ game_date <date> 2019-10-30, 2019-10-30, 2019-10-30, 2019-10-30,…
## $ release_speed <dbl> 87.9, 95.9, 96.5, 96.0, 86.7, 95.8, 95.8, 95.7, …
## $ release_pos_x <dbl> -2.65, -2.77, -2.68, -2.65, -2.73, -2.91, -2.76,…
## $ release_pos_z <dbl> 5.50, 5.52, 5.42, 5.55, 5.59, 5.42, 5.55, 5.61, …
## $ player_name <chr> "Hudson, Daniel", "Hudson, Daniel", "Hudson, Dan…
## $ batter <dbl> 488726, 488726, 488726, 488726, 488726, 488726, …
## $ pitcher <dbl> 543339, 543339, 543339, 543339, 543339, 543339, …
## $ events <chr> "strikeout", NA, NA, NA, NA, NA, NA, "strikeout"…
## $ description <chr> "swinging_strike", "foul", "ball", "foul", "ball…
## $ zone <dbl> 14, 7, 14, 9, 14, 7, 14, 5, 3, 5, 5, 5, 4, 12, 1…
## $ des <chr> "Michael Brantley strikes out swinging.", "Micha…
## $ game_type <chr> "W", "W", "W", "W", "W", "W", "W", "W", "W", "W"…
## $ stand <chr> "L", "L", "L", "L", "L", "L", "L", "R", "R", "R"…
## $ p_throws <chr> "R", "R", "R", "R", "R", "R", "R", "R", "R", "R"…
## $ home_team <chr> "HOU", "HOU", "HOU", "HOU", "HOU", "HOU", "HOU",…
## $ away_team <chr> "WSH", "WSH", "WSH", "WSH", "WSH", "WSH", "WSH",…
## $ type <chr> "S", "S", "B", "S", "B", "S", "B", "S", "S", "S"…
## $ balls <dbl> 3, 3, 2, 2, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 3, 3, …
## $ strikes <dbl> 2, 2, 2, 1, 1, 0, 0, 2, 1, 0, 1, 0, 0, 0, 2, 1, …
## $ game_year <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, …
## $ pfx_x <dbl> 0.02, -0.57, -0.66, -0.81, -0.05, -0.83, -0.70, …
## $ pfx_z <dbl> 0.21, 1.52, 1.40, 1.50, 0.47, 1.49, 1.48, 1.47, …
## $ plate_x <dbl> 0.88, -0.47, 1.68, 0.75, 1.27, -0.62, 1.34, 0.23…
## $ plate_z <dbl> 1.03, 1.92, 1.35, 2.05, 2.17, 1.61, 1.83, 2.69, …
## $ on_3b <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ on_2b <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ on_1b <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ outs_when_up <dbl> 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 0, 0, 2, 2, 1, 1, …
## $ inning <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, …
## $ inning_topbot <chr> "Bot", "Bot", "Bot", "Bot", "Bot", "Bot", "Bot",…
## $ hc_x <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 142.38, …
## $ hc_y <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 148.54, …
## $ vx0 <dbl> 8.472927, 7.236909, 12.923798, 10.686962, 9.6335…
## $ vy0 <dbl> -127.7980, -139.2124, -139.7033, -139.1332, -126…
## $ vz0 <dbl> -5.1816225, -7.1013608, -8.2305549, -6.8976200, …
## $ ax <dbl> -1.384471, -8.940442, -11.485524, -12.790017, -2…
## $ ay <dbl> 23.93211, 32.68395, 32.45965, 31.42231, 21.92107…
## $ az <dbl> -29.091156, -11.563514, -12.442450, -11.667275, …
## $ sz_top <dbl> 3.35, 3.35, 3.53, 3.35, 3.59, 3.53, 3.80, 3.35, …
## $ sz_bot <dbl> 1.40, 1.56, 1.63, 1.56, 1.63, 1.63, 1.78, 1.56, …
## $ hit_distance_sc <dbl> NA, 276, NA, 380, NA, NA, NA, NA, NA, NA, NA, NA…
## $ launch_speed <dbl> NA, 87.9, NA, 105.3, NA, NA, NA, NA, NA, NA, 80.…
## $ launch_angle <dbl> NA, 33, NA, 25, NA, NA, NA, NA, NA, NA, 69, NA, …
## $ effective_speed <dbl> 87.8, 94.4, 95.3, 94.9, 87.0, 95.1, 94.6, 94.9, …
## $ release_spin_rate <dbl> 2461, 2572, 2637, 2598, 2598, 2544, 2539, 2541, …
## $ release_extension <dbl> 6.1, 5.7, 5.9, 5.9, 6.2, 5.9, 5.8, 5.8, 6.0, 6.0…
## $ game_pk <dbl> 599377, 599377, 599377, 599377, 599377, 599377, …
## $ pitcher.1 <dbl> 543339, 543339, 543339, 543339, 543339, 543339, …
## $ release_pos_y <dbl> 54.42, 54.83, 54.55, 54.60, 54.28, 54.56, 54.69,…
## $ launch_speed_angle <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 3, NA, 3…
## $ at_bat_number <dbl> 79, 79, 79, 79, 79, 79, 79, 78, 78, 78, 77, 77, …
## $ pitch_number <dbl> 7, 6, 5, 4, 3, 2, 1, 3, 2, 1, 2, 1, 2, 1, 6, 5, …
## $ pitch_name <chr> "Slider", "4-Seam Fastball", "4-Seam Fastball", …
## $ home_score <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ away_score <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, …
## $ bat_score <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, …
## $ fld_score <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 2, 2, 2, 2, …
## $ post_away_score <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, …
## $ post_home_score <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ post_bat_score <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, …
## $ post_fld_score <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 2, 2, 2, 2, …
## $ if_fielding_alignment <chr> "Infield shift", "Infield shift", "Infield shift…
## $ of_fielding_alignment <chr> "Standard", "Standard", "Standard", "Standard", …
## $ spin_axis <dbl> 175, 201, 205, 208, 186, 209, 205, 205, 214, 210…
## $ delta_home_win_exp <dbl> -0.001, -0.001, -0.001, -0.001, -0.001, -0.001, …
## $ delta_run_exp <dbl> -0.137, 0.000, 0.038, -0.033, 0.024, -0.021, 0.0…
I noticed “des” has batter names in it, so might be able to extract that, but want to make sure there’s not something way simpler than that if I RTFM.
Lol, there’s something way simpler, matching on (I think!) ‘batter’ in ‘stats’ and ‘key_mlbam’ in ‘batmen’
# Doing a left_join since we want to retain all the info in the stats database and have batters show up wherever there's an event involving them (which could/should happen multiple times)
rise_batmen <- stats %>%
left_join(batmen, by = c("batter" = "key_mlbam"))
# The glimpse provides a sanity check with the des column against the batter_name column, where we see brantley in both places
glimpse(rise_batmen)
## Rows: 743,356
## Columns: 68
## $ pitch_type <chr> "SL", "FF", "FF", "FF", "SL", "FF", "FF", "FF", …
## $ game_date <date> 2019-10-30, 2019-10-30, 2019-10-30, 2019-10-30,…
## $ release_speed <dbl> 87.9, 95.9, 96.5, 96.0, 86.7, 95.8, 95.8, 95.7, …
## $ release_pos_x <dbl> -2.65, -2.77, -2.68, -2.65, -2.73, -2.91, -2.76,…
## $ release_pos_z <dbl> 5.50, 5.52, 5.42, 5.55, 5.59, 5.42, 5.55, 5.61, …
## $ player_name <chr> "Hudson, Daniel", "Hudson, Daniel", "Hudson, Dan…
## $ batter <dbl> 488726, 488726, 488726, 488726, 488726, 488726, …
## $ pitcher <dbl> 543339, 543339, 543339, 543339, 543339, 543339, …
## $ events <chr> "strikeout", NA, NA, NA, NA, NA, NA, "strikeout"…
## $ description <chr> "swinging_strike", "foul", "ball", "foul", "ball…
## $ zone <dbl> 14, 7, 14, 9, 14, 7, 14, 5, 3, 5, 5, 5, 4, 12, 1…
## $ des <chr> "Michael Brantley strikes out swinging.", "Micha…
## $ game_type <chr> "W", "W", "W", "W", "W", "W", "W", "W", "W", "W"…
## $ stand <chr> "L", "L", "L", "L", "L", "L", "L", "R", "R", "R"…
## $ p_throws <chr> "R", "R", "R", "R", "R", "R", "R", "R", "R", "R"…
## $ home_team <chr> "HOU", "HOU", "HOU", "HOU", "HOU", "HOU", "HOU",…
## $ away_team <chr> "WSH", "WSH", "WSH", "WSH", "WSH", "WSH", "WSH",…
## $ type <chr> "S", "S", "B", "S", "B", "S", "B", "S", "S", "S"…
## $ balls <dbl> 3, 3, 2, 2, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 3, 3, …
## $ strikes <dbl> 2, 2, 2, 1, 1, 0, 0, 2, 1, 0, 1, 0, 0, 0, 2, 1, …
## $ game_year <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, …
## $ pfx_x <dbl> 0.02, -0.57, -0.66, -0.81, -0.05, -0.83, -0.70, …
## $ pfx_z <dbl> 0.21, 1.52, 1.40, 1.50, 0.47, 1.49, 1.48, 1.47, …
## $ plate_x <dbl> 0.88, -0.47, 1.68, 0.75, 1.27, -0.62, 1.34, 0.23…
## $ plate_z <dbl> 1.03, 1.92, 1.35, 2.05, 2.17, 1.61, 1.83, 2.69, …
## $ on_3b <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ on_2b <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ on_1b <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ outs_when_up <dbl> 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 0, 0, 2, 2, 1, 1, …
## $ inning <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, …
## $ inning_topbot <chr> "Bot", "Bot", "Bot", "Bot", "Bot", "Bot", "Bot",…
## $ hc_x <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 142.38, …
## $ hc_y <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 148.54, …
## $ vx0 <dbl> 8.472927, 7.236909, 12.923798, 10.686962, 9.6335…
## $ vy0 <dbl> -127.7980, -139.2124, -139.7033, -139.1332, -126…
## $ vz0 <dbl> -5.1816225, -7.1013608, -8.2305549, -6.8976200, …
## $ ax <dbl> -1.384471, -8.940442, -11.485524, -12.790017, -2…
## $ ay <dbl> 23.93211, 32.68395, 32.45965, 31.42231, 21.92107…
## $ az <dbl> -29.091156, -11.563514, -12.442450, -11.667275, …
## $ sz_top <dbl> 3.35, 3.35, 3.53, 3.35, 3.59, 3.53, 3.80, 3.35, …
## $ sz_bot <dbl> 1.40, 1.56, 1.63, 1.56, 1.63, 1.63, 1.78, 1.56, …
## $ hit_distance_sc <dbl> NA, 276, NA, 380, NA, NA, NA, NA, NA, NA, NA, NA…
## $ launch_speed <dbl> NA, 87.9, NA, 105.3, NA, NA, NA, NA, NA, NA, 80.…
## $ launch_angle <dbl> NA, 33, NA, 25, NA, NA, NA, NA, NA, NA, 69, NA, …
## $ effective_speed <dbl> 87.8, 94.4, 95.3, 94.9, 87.0, 95.1, 94.6, 94.9, …
## $ release_spin_rate <dbl> 2461, 2572, 2637, 2598, 2598, 2544, 2539, 2541, …
## $ release_extension <dbl> 6.1, 5.7, 5.9, 5.9, 6.2, 5.9, 5.8, 5.8, 6.0, 6.0…
## $ game_pk <dbl> 599377, 599377, 599377, 599377, 599377, 599377, …
## $ pitcher.1 <dbl> 543339, 543339, 543339, 543339, 543339, 543339, …
## $ release_pos_y <dbl> 54.42, 54.83, 54.55, 54.60, 54.28, 54.56, 54.69,…
## $ launch_speed_angle <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 3, NA, 3…
## $ at_bat_number <dbl> 79, 79, 79, 79, 79, 79, 79, 78, 78, 78, 77, 77, …
## $ pitch_number <dbl> 7, 6, 5, 4, 3, 2, 1, 3, 2, 1, 2, 1, 2, 1, 6, 5, …
## $ pitch_name <chr> "Slider", "4-Seam Fastball", "4-Seam Fastball", …
## $ home_score <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ away_score <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, …
## $ bat_score <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, …
## $ fld_score <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 2, 2, 2, 2, …
## $ post_away_score <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, …
## $ post_home_score <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ post_bat_score <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, …
## $ post_fld_score <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 2, 2, 2, 2, …
## $ if_fielding_alignment <chr> "Infield shift", "Infield shift", "Infield shift…
## $ of_fielding_alignment <chr> "Standard", "Standard", "Standard", "Standard", …
## $ spin_axis <dbl> 175, 201, 205, 208, 186, 209, 205, 205, 214, 210…
## $ delta_home_win_exp <dbl> -0.001, -0.001, -0.001, -0.001, -0.001, -0.001, …
## $ delta_run_exp <dbl> -0.137, 0.000, 0.038, -0.033, 0.024, -0.021, 0.0…
## $ batter_name <chr> "brantley, michael", "brantley, michael", "brant…
# To double check though, and to be on theme, let's do a random slice and make sure those columns agree. (If they don't my model might be screwed before I even start since that means we're not matching info correctly) They do!
rise_batmen %>%
slice(3300) %>%
dplyr::select(des, batter_name)
## # A tibble: 1 x 2
## des batter_name
## <chr> <chr>
## 1 Howie Kendrick grounds into a double play, third baseman Tommy… kendrick, how…
The last time I watched baseball regularly the fielders stood in the same place for every batter, so domain expertise is not a thing for me. But just based on missing data/type of data I don’t want to try to predict events, hit_distance, or pitch_type (Though maybe the last one would be fine, but I don’t have to so I won’t!). I’m choosing delta_home_win_exp because it’s only missing twice in this training data, and it feels cool to predict the difference in win expectancy before and after each plate appearance.
skim(rise_batmen)
| Name | rise_batmen |
| Number of rows | 743356 |
| Number of columns | 68 |
| _______________________ | |
| Column type frequency: | |
| character | 16 |
| Date | 1 |
| numeric | 51 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| pitch_type | 7004 | 0.99 | 2 | 2 | 0 | 12 | 0 |
| player_name | 0 | 1.00 | 8 | 22 | 0 | 829 | 0 |
| events | 554778 | 0.25 | 4 | 25 | 0 | 26 | 0 |
| description | 0 | 1.00 | 4 | 23 | 0 | 13 | 0 |
| des | 0 | 1.00 | 14 | 400 | 0 | 125971 | 0 |
| game_type | 0 | 1.00 | 1 | 1 | 0 | 5 | 0 |
| stand | 0 | 1.00 | 1 | 1 | 0 | 2 | 0 |
| p_throws | 0 | 1.00 | 1 | 1 | 0 | 2 | 0 |
| home_team | 0 | 1.00 | 2 | 3 | 0 | 30 | 0 |
| away_team | 0 | 1.00 | 2 | 3 | 0 | 30 | 0 |
| type | 0 | 1.00 | 1 | 1 | 0 | 3 | 0 |
| inning_topbot | 0 | 1.00 | 3 | 3 | 0 | 2 | 0 |
| pitch_name | 7004 | 0.99 | 6 | 15 | 0 | 12 | 0 |
| if_fielding_alignment | 5415 | 0.99 | 8 | 13 | 0 | 3 | 0 |
| of_fielding_alignment | 5415 | 0.99 | 8 | 14 | 0 | 3 | 0 |
| batter_name | 0 | 1.00 | 8 | 21 | 0 | 989 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| game_date | 0 | 1 | 2019-03-20 | 2019-10-30 | 2019-06-29 | 210 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| release_speed | 7311 | 0.99 | 88.69 | 5.93 | 50.60 | 84.50 | 89.80 | 93.40 | 104.30 | ▁▁▂▇▃ |
| release_pos_x | 7332 | 0.99 | -0.76 | 1.96 | -5.38 | -2.16 | -1.53 | 1.23 | 4.98 | ▁▇▂▃▁ |
| release_pos_z | 7332 | 0.99 | 5.90 | 0.50 | 0.11 | 5.63 | 5.92 | 6.22 | 10.54 | ▁▁▇▂▁ |
| batter | 0 | 1.00 | 571465.57 | 67473.20 | 282332.00 | 519222.00 | 592567.00 | 622569.00 | 676606.00 | ▁▁▃▇▇ |
| pitcher | 0 | 1.00 | 571800.29 | 67839.45 | 282332.00 | 519326.00 | 592789.00 | 621381.00 | 677976.00 | ▁▁▅▆▇ |
| zone | 7332 | 0.99 | 9.24 | 4.19 | 1.00 | 6.00 | 11.00 | 13.00 | 14.00 | ▂▃▂▃▇ |
| balls | 0 | 1.00 | 0.89 | 0.97 | 0.00 | 0.00 | 1.00 | 2.00 | 4.00 | ▇▅▃▂▁ |
| strikes | 0 | 1.00 | 0.90 | 0.83 | 0.00 | 0.00 | 1.00 | 2.00 | 2.00 | ▇▁▆▁▆ |
| game_year | 0 | 1.00 | 2019.00 | 0.00 | 2019.00 | 2019.00 | 2019.00 | 2019.00 | 2019.00 | ▁▁▇▁▁ |
| pfx_x | 7332 | 0.99 | -0.15 | 0.86 | -2.66 | -0.87 | -0.24 | 0.53 | 2.38 | ▁▇▇▆▁ |
| pfx_z | 7332 | 0.99 | 0.65 | 0.74 | -2.21 | 0.22 | 0.79 | 1.25 | 5.39 | ▁▇▇▁▁ |
| plate_x | 7332 | 0.99 | 0.04 | 0.85 | -5.03 | -0.54 | 0.04 | 0.62 | 6.29 | ▁▃▇▁▁ |
| plate_z | 7332 | 0.99 | 2.25 | 0.96 | -4.13 | 1.62 | 2.25 | 2.87 | 12.21 | ▁▇▆▁▁ |
| on_3b | 675314 | 0.09 | 572890.71 | 66953.16 | 405395.00 | 519317.00 | 592663.00 | 623912.00 | 676606.00 | ▂▂▅▇▆ |
| on_2b | 607801 | 0.18 | 571908.66 | 67662.88 | 400085.00 | 519203.00 | 592663.00 | 623520.00 | 676606.00 | ▂▃▅▇▆ |
| on_1b | 518745 | 0.30 | 570370.28 | 67451.54 | 400085.00 | 519048.00 | 592444.00 | 622110.00 | 676606.00 | ▂▃▅▇▆ |
| outs_when_up | 0 | 1.00 | 0.98 | 0.82 | 0.00 | 0.00 | 1.00 | 2.00 | 2.00 | ▇▁▇▁▇ |
| inning | 0 | 1.00 | 5.03 | 2.69 | 1.00 | 3.00 | 5.00 | 7.00 | 19.00 | ▇▇▂▁▁ |
| hc_x | 616189 | 0.17 | 126.56 | 41.17 | 2.00 | 102.43 | 125.66 | 153.76 | 246.88 | ▁▃▇▅▁ |
| hc_y | 616189 | 0.17 | 120.52 | 44.69 | 2.30 | 85.14 | 122.97 | 157.66 | 230.96 | ▁▅▅▇▁ |
| vx0 | 7332 | 0.99 | 2.32 | 5.99 | -19.92 | -2.55 | 4.12 | 6.70 | 18.68 | ▁▃▃▇▁ |
| vy0 | 7332 | 0.99 | -128.94 | 8.61 | -151.33 | -135.78 | -130.60 | -122.86 | -73.14 | ▃▇▂▁▁ |
| vz0 | 7332 | 0.99 | -4.22 | 3.00 | -19.92 | -6.25 | -4.36 | -2.34 | 15.52 | ▁▃▇▁▁ |
| ax | 7332 | 0.99 | -2.52 | 10.30 | -30.80 | -11.23 | -2.74 | 5.29 | 28.55 | ▁▇▇▅▁ |
| ay | 7332 | 0.99 | 25.62 | 3.75 | 7.63 | 22.80 | 25.62 | 28.38 | 46.79 | ▁▃▇▁▁ |
| az | 7332 | 0.99 | -23.48 | 8.74 | -51.43 | -29.51 | -22.62 | -16.10 | 30.39 | ▂▇▆▁▁ |
| sz_top | 7332 | 0.99 | 3.37 | 0.19 | 2.50 | 3.25 | 3.37 | 3.49 | 4.31 | ▁▂▇▁▁ |
| sz_bot | 7332 | 0.99 | 1.60 | 0.11 | 0.75 | 1.54 | 1.60 | 1.67 | 2.44 | ▁▁▇▁▁ |
| hit_distance_sc | 548516 | 0.26 | 165.22 | 123.61 | 0.00 | 30.00 | 174.00 | 260.00 | 526.00 | ▇▆▅▃▁ |
| launch_speed | 539851 | 0.27 | 83.88 | 14.22 | 7.60 | 73.90 | 82.90 | 95.40 | 125.30 | ▁▁▅▇▂ |
| launch_angle | 539850 | 0.27 | 16.62 | 28.68 | -89.00 | -3.00 | 18.00 | 37.00 | 89.00 | ▁▂▇▇▂ |
| effective_speed | 4889 | 0.99 | 88.04 | 8.04 | 0.00 | 83.90 | 89.50 | 93.30 | 105.00 | ▁▁▁▃▇ |
| release_spin_rate | 20084 | 0.97 | 2255.36 | 323.34 | 419.00 | 2103.00 | 2274.00 | 2444.00 | 3741.00 | ▁▁▇▃▁ |
| release_extension | 7332 | 0.99 | 5.99 | 0.50 | 0.60 | 5.70 | 6.00 | 6.30 | 9.90 | ▁▁▇▅▁ |
| game_pk | 0 | 1.00 | 566671.45 | 4109.91 | 564734.00 | 565456.00 | 566197.00 | 566932.00 | 599377.00 | ▇▁▁▁▁ |
| pitcher.1 | 0 | 1.00 | 571800.29 | 67839.45 | 282332.00 | 519326.00 | 592789.00 | 621381.00 | 677976.00 | ▁▁▅▆▇ |
| release_pos_y | 7332 | 0.99 | 54.51 | 0.50 | 50.59 | 54.17 | 54.50 | 54.83 | 59.94 | ▁▅▇▁▁ |
| launch_speed_angle | 618174 | 0.17 | 3.20 | 1.26 | 1.00 | 2.00 | 3.00 | 4.00 | 6.00 | ▇▆▆▁▂ |
| at_bat_number | 0 | 1.00 | 39.51 | 23.42 | 1.00 | 19.00 | 39.00 | 58.00 | 148.00 | ▇▇▅▁▁ |
| pitch_number | 0 | 1.00 | 2.93 | 1.74 | 1.00 | 1.00 | 3.00 | 4.00 | 16.00 | ▇▂▁▁▁ |
| home_score | 0 | 1.00 | 2.36 | 2.68 | 0.00 | 0.00 | 2.00 | 4.00 | 21.00 | ▇▂▁▁▁ |
| away_score | 0 | 1.00 | 2.56 | 2.84 | 0.00 | 0.00 | 2.00 | 4.00 | 23.00 | ▇▂▁▁▁ |
| bat_score | 0 | 1.00 | 2.44 | 2.71 | 0.00 | 0.00 | 2.00 | 4.00 | 23.00 | ▇▂▁▁▁ |
| fld_score | 0 | 1.00 | 2.48 | 2.81 | 0.00 | 0.00 | 2.00 | 4.00 | 23.00 | ▇▂▁▁▁ |
| post_away_score | 0 | 1.00 | 2.58 | 2.84 | 0.00 | 0.00 | 2.00 | 4.00 | 23.00 | ▇▂▁▁▁ |
| post_home_score | 0 | 1.00 | 2.37 | 2.68 | 0.00 | 0.00 | 2.00 | 4.00 | 21.00 | ▇▂▁▁▁ |
| post_bat_score | 0 | 1.00 | 2.47 | 2.72 | 0.00 | 0.00 | 2.00 | 4.00 | 23.00 | ▇▂▁▁▁ |
| post_fld_score | 0 | 1.00 | 2.48 | 2.81 | 0.00 | 0.00 | 2.00 | 4.00 | 23.00 | ▇▂▁▁▁ |
| spin_axis | 7332 | 0.99 | 178.36 | 68.67 | 0.00 | 134.00 | 194.00 | 225.00 | 360.00 | ▂▃▇▅▁ |
| delta_home_win_exp | 2 | 1.00 | 0.00 | 0.03 | -0.74 | 0.00 | 0.00 | 0.00 | 0.91 | ▁▁▇▁▁ |
| delta_run_exp | 67 | 1.00 | 0.00 | 0.25 | -1.66 | -0.07 | -0.02 | 0.04 | 3.70 | ▁▇▁▁▁ |
Mostly making variables that wouldn’t be useful as character values but might contribute to prediction into factor variables, engineering whether runners are on base at all, engineering whether runners are in scoring position, and engineering difference between home and away score
I’m guessing post_away_score and post_home_score (and their difference!) will matter most for delta_home_win_exp in any model, but I guess we’ll see…
#glimpse(rise_batmen)
# Want to convert things that aren't actually numeric but might be relevant like pitcher and batter id to factors, engineer whether runners are on base or in scoring position
fancy_batmen <- rise_batmen %>%
mutate(across(c(pitch_type, batter, pitcher, events, description, game_type:type,inning_topbot,
if_fielding_alignment,of_fielding_alignment,game_year,game_pk,pitch_name), as.factor), # Converting either numeric or character to factors
on_base = factor(case_when(
!is.na(on_3b) | !is.na(on_2b) | !is.na(on_1b) ~ "Yes",
TRUE ~ "No"
)),
run_score_pos = factor(case_when(
!is.na(on_3b) | !is.na(on_2b) ~ "Yes",
TRUE ~ "No"
)),
post_delta_score = post_home_score - post_away_score) %>%
dplyr::select(-player_name,-des,-c(on_3b:on_1b),-pitcher.1,-batter_name,-game_year) ## Leave out variables that can't really contribute beyond other features or just aren't relevant
Alright, had to take care of some work stuff, I spent too long feature engineering, so let’s get to some initial visualization of correlations between retained numeric predictors.
There are some strong correlations here, but nothing so extreme I feel like I have to do dimension reduction (Though if I have time might revisit that later)
## looking at correlations between predictors
cor_mat <- fancy_batmen %>%
na.omit() %>% # Not ideal to just drop NAs, but trying to go faster here to get a general sense of what's happening
dplyr::select(is.numeric, -delta_home_win_exp) %>%
cor()
cor_map <-
heatmaply_cor(
cor_mat,
symm = TRUE,
cexRow = .0001,
cexCol = .0001,
branches_lwd = .1
)
cor_map
Let’s also look at associations between each of the numeric predictors and outcome. This code takes a while to run, so I’m going to be working on my modeling code as this is happening. Looking at these associations, basically all of them have flat associations with win expectancy, including engineered features. I’m likely going to need a model that can pick up on high level interactions if I want to have a shot at decent prediction. (And I might still not!)
# Do it once
fancy_batmen %>%
ggplot(aes(x = post_delta_score, y = delta_home_win_exp)) +
geom_point(alpha = 0.2, position = "jitter") +
geom_smooth(method = lm, formula = y ~ x, se = FALSE, col = "red") +
labs(y = "Change in Home Team Winning Expectation")
# Write a function
for_cor_plotting <- fancy_batmen %>%
dplyr::select(is.numeric, -delta_home_win_exp) %>%
names()
map(for_cor_plotting, ~{
fancy_batmen %>%
ggplot(aes(x = .data[[.x]], y = delta_home_win_exp)) +
geom_point(alpha = 0.2, position = "jitter") +
geom_smooth(method = lm, formula = y ~ x, se = FALSE, col = "red") +
labs(y = "Change in Home Team Winning Expectation")
})
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
##
## [[27]]
##
## [[28]]
##
## [[29]]
##
## [[30]]
##
## [[31]]
##
## [[32]]
##
## [[33]]
##
## [[34]]
##
## [[35]]
##
## [[36]]
##
## [[37]]
##
## [[38]]
##
## [[39]]
##
## [[40]]
##
## [[41]]
##
## [[42]]
##
## [[43]]
## Original model was taking way too long to run, so to make sure I have something within 2 hours I'm going to scale down to numeric predictors only
numbers_batmen <- fancy_batmen %>%
dplyr::select(where(is.numeric)) %>%
filter(!is.na(delta_home_win_exp))
skim(numbers_batmen) # Looking here becaue I think imputation is taking too long
| Name | numbers_batmen |
| Number of rows | 743354 |
| Number of columns | 44 |
| _______________________ | |
| Column type frequency: | |
| numeric | 44 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| release_speed | 7311 | 0.99 | 88.69 | 5.93 | 50.60 | 84.50 | 89.80 | 93.40 | 104.30 | ▁▁▂▇▃ |
| release_pos_x | 7332 | 0.99 | -0.76 | 1.96 | -5.38 | -2.16 | -1.53 | 1.23 | 4.98 | ▁▇▂▃▁ |
| release_pos_z | 7332 | 0.99 | 5.90 | 0.50 | 0.11 | 5.63 | 5.92 | 6.22 | 10.54 | ▁▁▇▂▁ |
| zone | 7332 | 0.99 | 9.24 | 4.19 | 1.00 | 6.00 | 11.00 | 13.00 | 14.00 | ▂▃▂▃▇ |
| balls | 0 | 1.00 | 0.89 | 0.97 | 0.00 | 0.00 | 1.00 | 2.00 | 4.00 | ▇▅▃▂▁ |
| strikes | 0 | 1.00 | 0.90 | 0.83 | 0.00 | 0.00 | 1.00 | 2.00 | 2.00 | ▇▁▆▁▆ |
| pfx_x | 7332 | 0.99 | -0.15 | 0.86 | -2.66 | -0.87 | -0.24 | 0.53 | 2.38 | ▁▇▇▆▁ |
| pfx_z | 7332 | 0.99 | 0.65 | 0.74 | -2.21 | 0.22 | 0.79 | 1.25 | 5.39 | ▁▇▇▁▁ |
| plate_x | 7332 | 0.99 | 0.04 | 0.85 | -5.03 | -0.54 | 0.04 | 0.62 | 6.29 | ▁▃▇▁▁ |
| plate_z | 7332 | 0.99 | 2.25 | 0.96 | -4.13 | 1.62 | 2.25 | 2.87 | 12.21 | ▁▇▆▁▁ |
| outs_when_up | 0 | 1.00 | 0.98 | 0.82 | 0.00 | 0.00 | 1.00 | 2.00 | 2.00 | ▇▁▇▁▇ |
| inning | 0 | 1.00 | 5.03 | 2.69 | 1.00 | 3.00 | 5.00 | 7.00 | 19.00 | ▇▇▂▁▁ |
| hc_x | 616188 | 0.17 | 126.56 | 41.17 | 2.00 | 102.43 | 125.66 | 153.76 | 246.88 | ▁▃▇▅▁ |
| hc_y | 616188 | 0.17 | 120.52 | 44.69 | 2.30 | 85.14 | 122.97 | 157.66 | 230.96 | ▁▅▅▇▁ |
| vx0 | 7332 | 0.99 | 2.32 | 5.99 | -19.92 | -2.55 | 4.12 | 6.70 | 18.68 | ▁▃▃▇▁ |
| vy0 | 7332 | 0.99 | -128.94 | 8.61 | -151.33 | -135.78 | -130.60 | -122.86 | -73.14 | ▃▇▂▁▁ |
| vz0 | 7332 | 0.99 | -4.22 | 3.00 | -19.92 | -6.25 | -4.36 | -2.34 | 15.52 | ▁▃▇▁▁ |
| ax | 7332 | 0.99 | -2.52 | 10.30 | -30.80 | -11.23 | -2.74 | 5.29 | 28.55 | ▁▇▇▅▁ |
| ay | 7332 | 0.99 | 25.62 | 3.75 | 7.63 | 22.80 | 25.62 | 28.38 | 46.79 | ▁▃▇▁▁ |
| az | 7332 | 0.99 | -23.48 | 8.74 | -51.43 | -29.51 | -22.62 | -16.10 | 30.39 | ▂▇▆▁▁ |
| sz_top | 7332 | 0.99 | 3.37 | 0.19 | 2.50 | 3.25 | 3.37 | 3.49 | 4.31 | ▁▂▇▁▁ |
| sz_bot | 7332 | 0.99 | 1.60 | 0.11 | 0.75 | 1.54 | 1.60 | 1.67 | 2.44 | ▁▁▇▁▁ |
| hit_distance_sc | 548516 | 0.26 | 165.22 | 123.61 | 0.00 | 30.00 | 174.00 | 260.00 | 526.00 | ▇▆▅▃▁ |
| launch_speed | 539851 | 0.27 | 83.88 | 14.22 | 7.60 | 73.90 | 82.90 | 95.40 | 125.30 | ▁▁▅▇▂ |
| launch_angle | 539850 | 0.27 | 16.62 | 28.68 | -89.00 | -3.00 | 18.00 | 37.00 | 89.00 | ▁▂▇▇▂ |
| effective_speed | 4889 | 0.99 | 88.04 | 8.04 | 0.00 | 83.90 | 89.50 | 93.30 | 105.00 | ▁▁▁▃▇ |
| release_spin_rate | 20084 | 0.97 | 2255.36 | 323.34 | 419.00 | 2103.00 | 2274.00 | 2444.00 | 3741.00 | ▁▁▇▃▁ |
| release_extension | 7332 | 0.99 | 5.99 | 0.50 | 0.60 | 5.70 | 6.00 | 6.30 | 9.90 | ▁▁▇▅▁ |
| release_pos_y | 7332 | 0.99 | 54.51 | 0.50 | 50.59 | 54.17 | 54.50 | 54.83 | 59.94 | ▁▅▇▁▁ |
| launch_speed_angle | 618173 | 0.17 | 3.20 | 1.26 | 1.00 | 2.00 | 3.00 | 4.00 | 6.00 | ▇▆▆▁▂ |
| at_bat_number | 0 | 1.00 | 39.51 | 23.42 | 1.00 | 19.00 | 39.00 | 58.00 | 148.00 | ▇▇▅▁▁ |
| pitch_number | 0 | 1.00 | 2.93 | 1.74 | 1.00 | 1.00 | 3.00 | 4.00 | 16.00 | ▇▂▁▁▁ |
| home_score | 0 | 1.00 | 2.36 | 2.68 | 0.00 | 0.00 | 2.00 | 4.00 | 21.00 | ▇▂▁▁▁ |
| away_score | 0 | 1.00 | 2.56 | 2.84 | 0.00 | 0.00 | 2.00 | 4.00 | 23.00 | ▇▂▁▁▁ |
| bat_score | 0 | 1.00 | 2.44 | 2.71 | 0.00 | 0.00 | 2.00 | 4.00 | 23.00 | ▇▂▁▁▁ |
| fld_score | 0 | 1.00 | 2.48 | 2.81 | 0.00 | 0.00 | 2.00 | 4.00 | 23.00 | ▇▂▁▁▁ |
| post_away_score | 0 | 1.00 | 2.58 | 2.84 | 0.00 | 0.00 | 2.00 | 4.00 | 23.00 | ▇▂▁▁▁ |
| post_home_score | 0 | 1.00 | 2.37 | 2.68 | 0.00 | 0.00 | 2.00 | 4.00 | 21.00 | ▇▂▁▁▁ |
| post_bat_score | 0 | 1.00 | 2.47 | 2.72 | 0.00 | 0.00 | 2.00 | 4.00 | 23.00 | ▇▂▁▁▁ |
| post_fld_score | 0 | 1.00 | 2.48 | 2.81 | 0.00 | 0.00 | 2.00 | 4.00 | 23.00 | ▇▂▁▁▁ |
| spin_axis | 7332 | 0.99 | 178.36 | 68.67 | 0.00 | 134.00 | 194.00 | 225.00 | 360.00 | ▂▃▇▅▁ |
| delta_home_win_exp | 0 | 1.00 | 0.00 | 0.03 | -0.74 | 0.00 | 0.00 | 0.00 | 0.91 | ▁▁▇▁▁ |
| delta_run_exp | 66 | 1.00 | 0.00 | 0.25 | -1.66 | -0.07 | -0.02 | 0.04 | 3.70 | ▁▇▁▁▁ |
| post_delta_score | 0 | 1.00 | -0.21 | 3.35 | -21.00 | -2.00 | 0.00 | 1.00 | 20.00 | ▁▁▇▁▁ |
numbers_batmen <- numbers_batmen %>% # Removing some features that might be relevant but just have to much missing data to do knn imputation in a reasonable amount of time (I don't think mean or median imputation makes sense for a lot of these, though I could be wrong. Weeee so little domain knowledge!!)
dplyr::select(-c(hc_x, hc_y, hit_distance_sc:launch_angle, launch_speed_angle))
sliced_recipe <-
recipe(delta_home_win_exp ~ ., data = numbers_batmen) %>%
#step_dummy(all_nominal()) %>% # Don't need this anymore since no nominal predictors
step_impute_median(all_numeric(),-all_outcomes()) # Switched to this since some distributions were skewed and knn imputation was taking too long given time constraints
## Checking to make sure the recipe will work in the model
pred_batmen <- sliced_recipe %>%
prep(verbose = TRUE) %>%
bake(new_data = numbers_batmen) %>%
print()
## oper 1 step impute median [training]
## The retained training set is ~ 215.52 Mb in memory.
##
## # A tibble: 743,354 x 38
## release_speed release_pos_x release_pos_z zone balls strikes pfx_x pfx_z
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 87.9 -2.65 5.5 14 3 2 0.02 0.21
## 2 95.9 -2.77 5.52 7 3 2 -0.57 1.52
## 3 96.5 -2.68 5.42 14 2 2 -0.66 1.4
## 4 96 -2.65 5.55 9 2 1 -0.81 1.5
## 5 86.7 -2.73 5.59 14 1 1 -0.05 0.47
## 6 95.8 -2.91 5.42 7 1 0 -0.83 1.49
## 7 95.8 -2.76 5.55 14 0 0 -0.7 1.48
## 8 95.7 -2.68 5.61 5 0 2 -0.69 1.47
## 9 95.5 -2.79 5.57 3 0 1 -0.94 1.4
## 10 96.3 -2.83 5.55 5 0 0 -0.83 1.43
## # … with 743,344 more rows, and 30 more variables: plate_x <dbl>,
## # plate_z <dbl>, outs_when_up <dbl>, inning <dbl>, vx0 <dbl>, vy0 <dbl>,
## # vz0 <dbl>, ax <dbl>, ay <dbl>, az <dbl>, sz_top <dbl>, sz_bot <dbl>,
## # effective_speed <dbl>, release_spin_rate <dbl>, release_extension <dbl>,
## # release_pos_y <dbl>, at_bat_number <dbl>, pitch_number <dbl>,
## # home_score <dbl>, away_score <dbl>, bat_score <dbl>, fld_score <dbl>,
## # post_away_score <dbl>, post_home_score <dbl>, post_bat_score <dbl>,
## # post_fld_score <dbl>, spin_axis <dbl>, delta_run_exp <dbl>,
## # post_delta_score <dbl>, delta_home_win_exp <dbl>
# Creating random forest model
# rf_mod <- rand_forest() %>%
# set_engine("ranger") %>%
# set_mode("regression")
# Creating gradient-boosted tree model
xg_mod <- boost_tree() %>%
set_engine("xgboost") %>%
set_mode("regression")
# Creating linear regression
lm_mod <- linear_reg() %>%
set_engine("lm")
# Create a list of preprocessing recipes
base_model_rec_list <- list(sliced_recipe, sliced_recipe)
base_model_mod_list <- list(lm_mod, xg_mod)
# Combining each recipe into a tidymodels workflows using a map function
base_model_wfs <- map2(base_model_rec_list, base_model_mod_list, ~{
wf <- workflow() %>%
add_recipe(.x) %>%
add_model(.y)
}
)
# Create as list of dataframes we'll be using to fit these models
base_model_data_list <- list(numbers_batmen, numbers_batmen)
# Fitting each model once to make sure they run, and they do (Note how long fitting one model takes for each as well, adjusting cross-validation for that reason)
base_model_one_time_fit <- map2(base_model_wfs, base_model_data_list, ~{
registerDoMC(cores = 7)
tic()
ema_rf_wf_fit <- fit(.x, data = .y)
toc()
ema_rf_wf_fit
})
## 3.681 sec elapsed
## 48.468 sec elapsed
Trying to run both models at once so everything else (pulling metrics, plotting predictions vs. actual values) can happen all at once! I know workflowsets exists now but I haven’t had a chance to really dive in on how to use it, so I’m using these manual map functions for now. I feel like workflowsets would probably save me time in the competition, but definitely not right now since I wouldn’t know wtf I was doing.
base_model_fit_all_rs <- map2(base_model_wfs, base_model_data_list, ~{
registerDoMC(cores = 7)
set.seed(33)
folds_pred <- vfold_cv(.y, v = 4, repeats = 4, strata = delta_home_win_exp)
## Run the CV models here
keep_pred <- control_resamples(save_pred = TRUE)
tic()
set.seed(33)
rf_fit_rs <-
.x %>%
fit_resamples(folds_pred, control = keep_pred)
toc()
rf_fit_rs
}
)
## 42.678 sec elapsed
## 170.062 sec elapsed
I originally wanted to chpose both random forest and boosted tree models because I saw there was essentially no linear association between any of the numeric predictors and the outcome, but the random forest ended up being way too compuationally expensive to fit within the time limit (Had to scramble at the end to get models that woudl just fit!).
I ended up fitting a simple linear model and the boosted tree model. The huge pro here is way reduced computation time, plus in lots of cases even much fancier, more computationally expensive models don’t actually improve fit by that much. A big con is it can’t pick up on interactions, and as I saw from visualiztions the univariate relationships between predictors and the outcome were pretty weak. I’m relying on lots of small predictors adding up in that model. The huge pro of the boosted tree is being able to pick up on those higher-order interactions, though I definitely had to reduce my number of cross-validaitons and repeats to get it to fit into the time limit (so a con there since it effects how stable my measures of out of sample accuracy might be for both models!)
The boosted tree model did better than the simple linear model on both RMSE (0.019 BT vs. 0.024 for LM), which is on the same scale as the original metric. Based on my earlier skim these predictions are both inside 1 SD of the outcome, but not by a bunch. I’m sure these predictions aren’t awesome, but they exist!
base_model_metrics <- map(base_model_fit_all_rs, ~{
.x %>%
collect_metrics(summarize = TRUE)
}) %>%
print()
## [[1]]
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 0.0236 16 0.0000658 Preprocessor1_Model1
## 2 rsq standard 0.317 16 0.00122 Preprocessor1_Model1
##
## [[2]]
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 0.0191 16 0.0000716 Preprocessor1_Model1
## 2 rsq standard 0.570 16 0.00214 Preprocessor1_Model1
The linear model obviously isn’t doing super hot, but even the boosted tree model breaks down a lot at the extremes + isn’t that awesome in the middle either. Looking back I wonder if some kind of zero-inflated model would have done much better (though that would be tricky since negative values are possible, it’s not a count variable)
base_model_preds_all_rs <- map(base_model_fit_all_rs, ~{
preds <- .x %>%
collect_predictions(summarize = TRUE)
})
map(base_model_preds_all_rs, ~{
.x %>%
ggplot(aes(x = .pred, y = delta_home_win_exp)) +
geom_point(alpha = 0.2, position = "jitter")
}
)
## [[1]]
##
## [[2]]
This was super fun and really wild! I definitely think I’ll end up using less time-intensive stuff earlier if I make it onto Sliced, try to get an initial model running, and then go back to see if I can improve upon it. I think I avoided some pitfalls in feature engineering ok, but I didn’t anticipate my models taking so long to run later, so I have to better budget model running time in the future (though I was able to write my metrics/visualize predictions code while the model was running, and I think that helped). I even had to take a break in the middle to handle some work stuff (I recognize not ideal/realistic), but I think work-time I spent almost exactly (yikes!) 2 hours on this code. I had to make way more tradeoffs in the models/preprocessing I wanted to do vs. the models/preprocessing that I knew could happen with 2 hours than I thought I would! I think I learned a lot even just from doing this screener, so thanks for making it available!